Time series, the course I often wish I had taken while completing my coursework in school. I finally got an excuse to do a comparitive dive into the different time series models in the forcast package in R thanks to an invitation to present at a recent Practical Data Science Meetup in Salt Lake City.

library(tidyverse)
library(gridExtra)
library(lubridate)
library(leaflet)
library(randomForest)
library(forecast)
library(prophet)
# Visualize sensor locations
w.sensors <- weather %>% distinct(LATITUDE,LONGITUDE)
pm.sensors <- pm25 %>%
  # Remove data from sensors that don't span entire time period
  filter(!(AQS_SITE_ID %in% c(490353013,490450003))) %>%
  distinct(AQS_SITE_ID,SITE_LATITUDE,SITE_LONGITUDE) %>% 
  rename(LATITUDE=SITE_LATITUDE, 
         LONGITUDE=SITE_LONGITUDE)

w.icons <- awesomeIcons(
  icon = 'tint',
  iconColor = 'blue',
  markerColor = "white"
)
pm.icons <- awesomeIcons(
  icon = 'cloud',
  iconColor = 'gray',
  markerColor = "black"
)
m <- leaflet() %>% 
  addProviderTiles(providers$Esri.NatGeoWorldMap) %>%
  setView(-112, 40.7, zoom = 10) %>% 
  addAwesomeMarkers(data=w.sensors, icon = w.icons) %>% 
  addAwesomeMarkers(data=pm.sensors, icon = pm.icons,label = ~as.character(AQS_SITE_ID))
m

OLS Regression

fit1 <- lm(sqrt(pm2.5)~inversion+wind+precip+fireworks,data=dat)
summary(fit1)
## 
## Call:
## lm(formula = sqrt(pm2.5) ~ inversion + wind + precip + fireworks, 
##     data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1571 -0.5555 -0.1835  0.3608  4.4629 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.322431   0.066358  50.068  < 2e-16 ***
## inversion    2.527237   0.130122  19.422  < 2e-16 ***
## wind        -0.040543   0.003255 -12.454  < 2e-16 ***
## precip      -0.515741   0.175563  -2.938  0.00336 ** 
## fireworks    0.545624   0.116089   4.700 2.85e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8791 on 1456 degrees of freedom
## Multiple R-squared:  0.3165, Adjusted R-squared:  0.3146 
## F-statistic: 168.5 on 4 and 1456 DF,  p-value: < 2.2e-16
dat$resid[!is.na(dat$pm2.5)] <- resid(fit1)

# Plot the residuals
ggplot(dat,aes(date,resid)) + 
  geom_point() + geom_smooth() +
  ggtitle("Linear Regression Residuals",
          subtitle = paste0("RMSE: ",round(sqrt(mean(dat$resid^2,na.rm=TRUE)),2)))

Acf(dat$resid, main="ACF of OLS Residuals")

Random Forest Regression

fit2 <- randomForest(sqrt(pm2.5)~inversion+wind+precip+fireworks,data=dat[!is.na(dat$pm2.5),], ntree=500)
dat$rf.resid[!is.na(dat$pm2.5)] <- fit2$predicted - sqrt(dat$pm2.5[!is.na(dat$pm2.5)])

# Plot the residuals
ggplot(dat,aes(date,rf.resid)) + 
  geom_point() + geom_smooth() +
  ggtitle("Random Forest Residuals",
          subtitle = paste0("RMSE: ",round(sqrt(fit2$mse[500]),2)))

# Better but we still have some odd things going on in our data

# Zoom In
p1 <- ggplot(dat,aes(date,rf.resid)) + 
  geom_point() + geom_line() +
  xlim(as.Date(c("2014-01-01","2014-02-28"))) + 
  geom_abline(slope=0, intercept = 0, lty=2, col = "blue", lwd = 1.25)

p2 <- ggplot(dat,aes(date,rf.resid)) + 
  geom_point() + geom_line() +
  xlim(as.Date(c("2017-11-01","2017-12-31"))) + 
  geom_abline(slope=0, intercept = 0, lty=2, col = "blue", lwd = 1.25)


grid.arrange(p1, p2, ncol=2, top="Zoom-in of Random Forest Residuals")

Exponential Smoothing

# Convert to time series data
dat.ts <- sqrt(ts(dat[,"pm2.5"], frequency = 7))

# Exponential smoothing model with weekly seasonality
fit3 <- ets(dat.ts) # model = "MAN"
fit4a <- ets(dat.ts, model ="AAA")
fit4b <- ets(dat.ts, model ="MMM")
# Fit models with all additive or all multiplicative features. First byte is for errors, second for trend, and third for seasonality

# Predict Future Values
plot(predict(fit3,h=25),xlim=c(200,215))

ets.mod <- rbind(data.frame(day=1:sum(!is.na(dat.ts)), resid=as.numeric(residuals(fit3)), type="Auto"),
                 data.frame(day=1:sum(!is.na(dat.ts)), resid=as.numeric(residuals(fit4a)), type="Additive"),
                 data.frame(day=1:sum(!is.na(dat.ts)), resid=as.numeric(residuals(fit4b)), type="Multiplicative"))

# Compare the residuals of each model
ggplot(ets.mod,aes(day,resid)) + 
  geom_point() + geom_smooth() + 
  facet_grid(type~.,scales="free")+
  ggtitle("ETS Residuals with Weekly Seasonality",
          subtitle = paste0("Auto RMSE: ",round(sqrt(fit3$mse),2),
                            "   Additive RMSE: ",round(sqrt(fit4a$mse),2),
                            "   Multiplicative RMSE: ",round(sqrt(fit4b$mse),2)))

TBATS (Trigonometric regressors, Box-Cox transformations, ARMA errors, Trend, Seasonality)

# TBATS model with weekly and yearly seasonality
dat.ts2 <- sqrt(msts(dat[!is.na(dat$pm2.5),"pm2.5"], seasonal.periods=c(7,365.25)))
fit5 <- tbats(dat.ts2)
# This method takes the most time when comparing run time.
# Down side on this is that you cannot set specific box-cox, ARMA, and fourier parameters.

# Predict future values
plot(predict(fit5, h=25),xlim=c(4.8,5.2))

# Plot the residuals
tbats.mod <- data.frame(day=1:sum(!is.na(dat.ts2)),resid=as.numeric(residuals(fit5)))
ggplot(tbats.mod,aes(day,resid)) + 
  geom_point() + geom_smooth() + 
  ggtitle("TBATS Resids with Dual Seasonality",
          subtitle = paste0("Auto RMSE: ",round(sqrt(mean((residuals(fit5))^2)),2)))

ARIMA with Regressors (AutoRegressive Integraged Moving Average)

# ARIMA with weekly and yearly seasonality with regressors
regs <- dat[!is.na(dat$pm2.5),c("precip","wind","inversion_diff","fireworks")]

# Forecast weather regressors
weather.ts <- msts(dat[,c("precip","wind","inversion_diff")],seasonal.periods = c(7,365.25))
precip <- auto.arima(weather.ts[,1])
fprecip <- as.numeric(data.frame(forecast(precip,h=25))$Point.Forecast)
wind <- auto.arima(weather.ts[,2])
fwind <- as.numeric(data.frame(forecast(wind,h=25))$Point.Forecast)
inversion <- auto.arima(weather.ts[,3])
finversion <- as.numeric(data.frame(forecast(inversion,h=25))$Point.Forecast)

fregs <- data.frame(precip=fprecip,wind=fwind,inversion=as.numeric(finversion<0),fireworks=0)

# Seasonality
z <- fourier(dat.ts2, K=c(2,5))
zf <- fourier(dat.ts2, K=c(2,5), h=25)

# Fit the model
fit <- auto.arima(dat.ts2, xreg=cbind(z,regs), seasonal=FALSE)

# Predict Future Values
# This time we need future values of the regressors as well.
fc <- forecast(fit, xreg=cbind(zf,fregs), h=25)
plot(fc,xlim=c(4.8,5.2))

# Plot the residuals
arima.mod <- data.frame(day=1:sum(!is.na(dat.ts)),resid=as.numeric(residuals(fit)))

ggplot(arima.mod,aes(day,resid)) + 
  geom_point() + geom_smooth() + 
  ggtitle("Arima Resids with Seasonality and Regressors",
          subtitle = paste0("RMSE: ",round(sqrt(mean((residuals(fit))^2)),2)))

prophet

pdat <- data.frame(ds=dat$date,
                   y=sqrt(dat$pm2.5),
                   precip=dat$precip,
                   wind=dat$wind,
                   inversion_diff=dat$inversion_diff,
                   inversion=dat$inversion_,
                   fireworks=dat$fireworks)

# Forecast weather regressors
pfdat <- data.frame(ds=max(dat$date) + 1:25)
pprecip <- pdat %>% 
  select(ds,y=precip) %>% 
  prophet() %>%
  predict(pfdat)
## Initial log joint probability = -5.77805
## Optimization terminated normally: 
##   Convergence detected: relative gradient magnitude is below tolerance
pwind <- pdat %>% 
  select(ds,y=wind) %>% 
  prophet() %>%
  predict(pfdat)
## Initial log joint probability = -46.5575
## Optimization terminated normally: 
##   Convergence detected: relative gradient magnitude is below tolerance
pinversion <- pdat %>% 
  select(ds,y=inversion_diff) %>% 
  prophet() %>%
  predict(pfdat)
## Initial log joint probability = -55.0515
## Optimization terminated normally: 
##   Convergence detected: relative gradient magnitude is below tolerance
fdat <-  data.frame(ds=pfdat$ds,
                    precip=pprecip$yhat,
                    wind=pwind$yhat,
                    inversion=as.numeric(pinversion$yhat<0),
                    fireworks = 0)

# Fit the model (Seasonality automatically determined)
fit6 <- prophet() %>% 
  add_regressor('precip') %>% 
  add_regressor('wind') %>% 
  add_regressor('inversion') %>% 
  add_regressor('fireworks') %>% 
  fit.prophet(pdat)
## Initial log joint probability = -120.752
## Optimization terminated normally: 
##   Convergence detected: relative gradient magnitude is below tolerance
# Forecast future values
forecast <- predict(fit6, fdat)

# Get the residuals
fpred <- predict(fit6)
fpred$ds <- as.Date(fpred$ds)
fpred <- pdat %>% left_join(fpred,by="ds")
fpred$resid <- fpred$y - fpred$yhat

# Plot the residuals
ggplot(fpred,aes(ds,resid)) + 
  geom_point() + geom_smooth() + 
  ggtitle("Prophet with Seasonality and Regressors",
          subtitle = paste0("RMSE: ",round(sqrt(mean(fpred$resid^2)),2)))

Comparison (Show in PowerPoint)

# RMSE by cutoff
all.cv %>% 
  group_by(model,cutoff) %>% 
  summarise(rmse=sqrt(mean((y-yhat)^2))) %>% 
  ggplot(.,aes(x=cutoff,y=rmse,group=model,color=model)) +
  geom_line(alpha=.75) + geom_point(alpha=.75)

# RMSE by horizon
all.cv %>% 
  group_by(model,day) %>% 
  summarise(rmse=sqrt(mean((y-yhat)^2))) %>% 
  ggplot(.,aes(x=day,y=rmse,group=model,color=model)) +
  geom_line(alpha=.75) + geom_point(alpha=.75)

# Prediction behaviors of different methods
ggplot(all.cv,aes(date,yhat,group=as.factor(cutoff),color=as.factor(cutoff)))+
  geom_line()+
  geom_line(aes(y=y),color="black",alpha=.15)+#geom_point(aes(y=y),color="black",alpha=.15)+
  facet_wrap(~model)+ guides(color="none") +
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())